0. Prerequisites

0.0 Libraries

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(reshape2)

# string processing
library(stringr)

# viz
library(ggplot2)
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.2
## corrplot 0.84 loaded
library(grid)
library(ggridges)
## Warning: package 'ggridges' was built under R version 3.4.1

0.1 Helper functions

# Draw several plot objects on one page, arranged on a grid.
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - ...:      Plot objects to draw.
# - plotlist: Optional list of further plot objects, drawn after those in '...'.
# - file:     Not used by the function; kept so existing calls stay valid.
# - cols:     Number of columns in layout
# - layout:   A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
# A single plot is simply printed. When no plot is supplied at all, the
# function returns invisible NULL without opening a new page.
#
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {

  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  num_plots <- length(plots)

  # Nothing to draw: 1:0 would iterate over c(1, 0) and index a missing plot
  if (num_plots == 0) {
    return(invisible(NULL))
  }

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Number of rows needed to hold all plots in 'cols' columns
    rows <- ceiling(num_plots / cols)
    layout <- matrix(seq_len(cols * rows), ncol = cols, nrow = rows)
  }

  if (num_plots == 1) {
    print(plots[[1]])
  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in seq_len(num_plots)) {
      # Get the row/column positions of the layout cells that hold plot i
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

1. First glance

Here, I’ll load the table, and take a first glance at the columns it contains.

Loading the data:

# load the training table; readr guesses the column types and prints the
# specification it used
train <- read_csv("../data/application_train.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   SK_ID_CURR = col_integer(),
##   TARGET = col_integer(),
##   NAME_CONTRACT_TYPE = col_character(),
##   CODE_GENDER = col_character(),
##   FLAG_OWN_CAR = col_character(),
##   FLAG_OWN_REALTY = col_character(),
##   CNT_CHILDREN = col_integer(),
##   NAME_TYPE_SUITE = col_character(),
##   NAME_INCOME_TYPE = col_character(),
##   NAME_EDUCATION_TYPE = col_character(),
##   NAME_FAMILY_STATUS = col_character(),
##   NAME_HOUSING_TYPE = col_character(),
##   DAYS_BIRTH = col_integer(),
##   DAYS_EMPLOYED = col_integer(),
##   DAYS_ID_PUBLISH = col_integer(),
##   FLAG_MOBIL = col_integer(),
##   FLAG_EMP_PHONE = col_integer(),
##   FLAG_WORK_PHONE = col_integer(),
##   FLAG_CONT_MOBILE = col_integer(),
##   FLAG_PHONE = col_integer()
##   # ... with 37 more columns
## )
## See spec(...) for full column specifications.

And taking a look at the column structure:

# print the structure of the table: dimensions, column names, column types
# and the first few values of each column
str(train)
## Classes 'tbl_df', 'tbl' and 'data.frame':    307511 obs. of  122 variables:
##  $ SK_ID_CURR                  : int  100002 100003 100004 100006 100007 100008 100009 100010 100011 100012 ...
##  $ TARGET                      : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ NAME_CONTRACT_TYPE          : chr  "Cash loans" "Cash loans" "Revolving loans" "Cash loans" ...
##  $ CODE_GENDER                 : chr  "M" "F" "M" "F" ...
##  $ FLAG_OWN_CAR                : chr  "N" "N" "Y" "N" ...
##  $ FLAG_OWN_REALTY             : chr  "Y" "N" "Y" "Y" ...
##  $ CNT_CHILDREN                : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ AMT_INCOME_TOTAL            : num  202500 270000 67500 135000 121500 ...
##  $ AMT_CREDIT                  : num  406598 1293502 135000 312682 513000 ...
##  $ AMT_ANNUITY                 : num  24700 35698 6750 29686 21866 ...
##  $ AMT_GOODS_PRICE             : num  351000 1129500 135000 297000 513000 ...
##  $ NAME_TYPE_SUITE             : chr  "Unaccompanied" "Family" "Unaccompanied" "Unaccompanied" ...
##  $ NAME_INCOME_TYPE            : chr  "Working" "State servant" "Working" "Working" ...
##  $ NAME_EDUCATION_TYPE         : chr  "Secondary / secondary special" "Higher education" "Secondary / secondary special" "Secondary / secondary special" ...
##  $ NAME_FAMILY_STATUS          : chr  "Single / not married" "Married" "Single / not married" "Civil marriage" ...
##  $ NAME_HOUSING_TYPE           : chr  "House / apartment" "House / apartment" "House / apartment" "House / apartment" ...
##  $ REGION_POPULATION_RELATIVE  : num  0.0188 0.00354 0.01003 0.00802 0.02866 ...
##  $ DAYS_BIRTH                  : int  -9461 -16765 -19046 -19005 -19932 -16941 -13778 -18850 -20099 -14469 ...
##  $ DAYS_EMPLOYED               : int  -637 -1188 -225 -3039 -3038 -1588 -3130 -449 365243 -2019 ...
##  $ DAYS_REGISTRATION           : num  -3648 -1186 -4260 -9833 -4311 ...
##  $ DAYS_ID_PUBLISH             : int  -2120 -291 -2531 -2437 -3458 -477 -619 -2379 -3514 -3992 ...
##  $ OWN_CAR_AGE                 : num  NA NA 26 NA NA NA 17 8 NA NA ...
##  $ FLAG_MOBIL                  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ FLAG_EMP_PHONE              : int  1 1 1 1 1 1 1 1 0 1 ...
##  $ FLAG_WORK_PHONE             : int  0 0 1 0 0 1 0 1 0 0 ...
##  $ FLAG_CONT_MOBILE            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ FLAG_PHONE                  : int  1 1 1 0 0 1 1 0 0 0 ...
##  $ FLAG_EMAIL                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ OCCUPATION_TYPE             : chr  "Laborers" "Core staff" "Laborers" "Laborers" ...
##  $ CNT_FAM_MEMBERS             : num  1 2 1 2 1 2 3 2 2 1 ...
##  $ REGION_RATING_CLIENT        : int  2 1 2 2 2 2 2 3 2 2 ...
##  $ REGION_RATING_CLIENT_W_CITY : int  2 1 2 2 2 2 2 3 2 2 ...
##  $ WEEKDAY_APPR_PROCESS_START  : chr  "WEDNESDAY" "MONDAY" "MONDAY" "WEDNESDAY" ...
##  $ HOUR_APPR_PROCESS_START     : int  10 11 9 17 11 16 16 16 14 8 ...
##  $ REG_REGION_NOT_LIVE_REGION  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ REG_REGION_NOT_WORK_REGION  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ LIVE_REGION_NOT_WORK_REGION : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ REG_CITY_NOT_LIVE_CITY      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ REG_CITY_NOT_WORK_CITY      : int  0 0 0 0 1 0 0 1 0 0 ...
##  $ LIVE_CITY_NOT_WORK_CITY     : int  0 0 0 0 1 0 0 1 0 0 ...
##  $ ORGANIZATION_TYPE           : chr  "Business Entity Type 3" "School" "Government" "Business Entity Type 3" ...
##  $ EXT_SOURCE_1                : num  0.083 0.311 NA NA NA ...
##  $ EXT_SOURCE_2                : num  0.263 0.622 0.556 0.65 0.323 ...
##  $ EXT_SOURCE_3                : num  0.139 NA 0.73 NA NA ...
##  $ APARTMENTS_AVG              : num  0.0247 0.0959 NA NA NA NA NA NA NA NA ...
##  $ BASEMENTAREA_AVG            : num  0.0369 0.0529 NA NA NA NA NA NA NA NA ...
##  $ YEARS_BEGINEXPLUATATION_AVG : num  0.972 0.985 NA NA NA ...
##  $ YEARS_BUILD_AVG             : num  0.619 0.796 NA NA NA ...
##  $ COMMONAREA_AVG              : num  0.0143 0.0605 NA NA NA NA NA NA NA NA ...
##  $ ELEVATORS_AVG               : num  0 0.08 NA NA NA NA NA NA NA NA ...
##  $ ENTRANCES_AVG               : num  0.069 0.0345 NA NA NA NA NA NA NA NA ...
##  $ FLOORSMAX_AVG               : num  0.0833 0.2917 NA NA NA ...
##  $ FLOORSMIN_AVG               : num  0.125 0.333 NA NA NA ...
##  $ LANDAREA_AVG                : num  0.0369 0.013 NA NA NA NA NA NA NA NA ...
##  $ LIVINGAPARTMENTS_AVG        : num  0.0202 0.0773 NA NA NA NA NA NA NA NA ...
##  $ LIVINGAREA_AVG              : num  0.019 0.0549 NA NA NA NA NA NA NA NA ...
##  $ NONLIVINGAPARTMENTS_AVG     : num  0 0.0039 NA NA NA NA NA NA NA NA ...
##  $ NONLIVINGAREA_AVG           : num  0 0.0098 NA NA NA NA NA NA NA NA ...
##  $ APARTMENTS_MODE             : num  0.0252 0.0924 NA NA NA NA NA NA NA NA ...
##  $ BASEMENTAREA_MODE           : num  0.0383 0.0538 NA NA NA NA NA NA NA NA ...
##  $ YEARS_BEGINEXPLUATATION_MODE: num  0.972 0.985 NA NA NA ...
##  $ YEARS_BUILD_MODE            : num  0.634 0.804 NA NA NA ...
##  $ COMMONAREA_MODE             : num  0.0144 0.0497 NA NA NA NA NA NA NA NA ...
##  $ ELEVATORS_MODE              : num  0 0.0806 NA NA NA NA NA NA NA NA ...
##  $ ENTRANCES_MODE              : num  0.069 0.0345 NA NA NA NA NA NA NA NA ...
##  $ FLOORSMAX_MODE              : num  0.0833 0.2917 NA NA NA ...
##  $ FLOORSMIN_MODE              : num  0.125 0.333 NA NA NA ...
##  $ LANDAREA_MODE               : num  0.0377 0.0128 NA NA NA NA NA NA NA NA ...
##  $ LIVINGAPARTMENTS_MODE       : num  0.022 0.079 NA NA NA NA NA NA NA NA ...
##  $ LIVINGAREA_MODE             : num  0.0198 0.0554 NA NA NA NA NA NA NA NA ...
##  $ NONLIVINGAPARTMENTS_MODE    : num  0 0 NA NA NA NA NA NA NA NA ...
##  $ NONLIVINGAREA_MODE          : num  0 0 NA NA NA NA NA NA NA NA ...
##  $ APARTMENTS_MEDI             : num  0.025 0.0968 NA NA NA NA NA NA NA NA ...
##  $ BASEMENTAREA_MEDI           : num  0.0369 0.0529 NA NA NA NA NA NA NA NA ...
##  $ YEARS_BEGINEXPLUATATION_MEDI: num  0.972 0.985 NA NA NA ...
##  $ YEARS_BUILD_MEDI            : num  0.624 0.799 NA NA NA ...
##  $ COMMONAREA_MEDI             : num  0.0144 0.0608 NA NA NA NA NA NA NA NA ...
##  $ ELEVATORS_MEDI              : num  0 0.08 NA NA NA NA NA NA NA NA ...
##  $ ENTRANCES_MEDI              : num  0.069 0.0345 NA NA NA NA NA NA NA NA ...
##  $ FLOORSMAX_MEDI              : num  0.0833 0.2917 NA NA NA ...
##  $ FLOORSMIN_MEDI              : num  0.125 0.333 NA NA NA ...
##  $ LANDAREA_MEDI               : num  0.0375 0.0132 NA NA NA NA NA NA NA NA ...
##  $ LIVINGAPARTMENTS_MEDI       : num  0.0205 0.0787 NA NA NA NA NA NA NA NA ...
##  $ LIVINGAREA_MEDI             : num  0.0193 0.0558 NA NA NA NA NA NA NA NA ...
##  $ NONLIVINGAPARTMENTS_MEDI    : num  0 0.0039 NA NA NA NA NA NA NA NA ...
##  $ NONLIVINGAREA_MEDI          : num  0 0.01 NA NA NA NA NA NA NA NA ...
##  $ FONDKAPREMONT_MODE          : chr  "reg oper account" "reg oper account" NA NA ...
##  $ HOUSETYPE_MODE              : chr  "block of flats" "block of flats" NA NA ...
##  $ TOTALAREA_MODE              : num  0.0149 0.0714 NA NA NA NA NA NA NA NA ...
##  $ WALLSMATERIAL_MODE          : chr  "Stone, brick" "Block" NA NA ...
##  $ EMERGENCYSTATE_MODE         : chr  "No" "No" NA NA ...
##  $ OBS_30_CNT_SOCIAL_CIRCLE    : num  2 1 0 2 0 0 1 2 1 2 ...
##  $ DEF_30_CNT_SOCIAL_CIRCLE    : num  2 0 0 0 0 0 0 0 0 0 ...
##  $ OBS_60_CNT_SOCIAL_CIRCLE    : num  2 1 0 2 0 0 1 2 1 2 ...
##  $ DEF_60_CNT_SOCIAL_CIRCLE    : num  2 0 0 0 0 0 0 0 0 0 ...
##  $ DAYS_LAST_PHONE_CHANGE      : num  -1134 -828 -815 -617 -1106 ...
##  $ FLAG_DOCUMENT_2             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ FLAG_DOCUMENT_3             : int  1 1 0 1 0 1 0 1 1 0 ...
##  $ FLAG_DOCUMENT_4             : int  0 0 0 0 0 0 0 0 0 0 ...
##   [list output truncated]
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 122
##   .. ..$ SK_ID_CURR                  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ TARGET                      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ NAME_CONTRACT_TYPE          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ CODE_GENDER                 : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ FLAG_OWN_CAR                : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ FLAG_OWN_REALTY             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ CNT_CHILDREN                : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ AMT_INCOME_TOTAL            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ AMT_CREDIT                  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ AMT_ANNUITY                 : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ AMT_GOODS_PRICE             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ NAME_TYPE_SUITE             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ NAME_INCOME_TYPE            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ NAME_EDUCATION_TYPE         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ NAME_FAMILY_STATUS          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ NAME_HOUSING_TYPE           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ REGION_POPULATION_RELATIVE  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ DAYS_BIRTH                  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ DAYS_EMPLOYED               : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ DAYS_REGISTRATION           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ DAYS_ID_PUBLISH             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ OWN_CAR_AGE                 : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ FLAG_MOBIL                  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ FLAG_EMP_PHONE              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ FLAG_WORK_PHONE             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ FLAG_CONT_MOBILE            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ FLAG_PHONE                  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ FLAG_EMAIL                  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ OCCUPATION_TYPE             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ CNT_FAM_MEMBERS             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ REGION_RATING_CLIENT        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ REGION_RATING_CLIENT_W_CITY : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ WEEKDAY_APPR_PROCESS_START  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ HOUR_APPR_PROCESS_START     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ REG_REGION_NOT_LIVE_REGION  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ REG_REGION_NOT_WORK_REGION  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ LIVE_REGION_NOT_WORK_REGION : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ REG_CITY_NOT_LIVE_CITY      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ REG_CITY_NOT_WORK_CITY      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ LIVE_CITY_NOT_WORK_CITY     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ ORGANIZATION_TYPE           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ EXT_SOURCE_1                : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ EXT_SOURCE_2                : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ EXT_SOURCE_3                : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ APARTMENTS_AVG              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ BASEMENTAREA_AVG            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ YEARS_BEGINEXPLUATATION_AVG : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ YEARS_BUILD_AVG             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ COMMONAREA_AVG              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ ELEVATORS_AVG               : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ ENTRANCES_AVG               : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ FLOORSMAX_AVG               : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ FLOORSMIN_AVG               : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ LANDAREA_AVG                : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ LIVINGAPARTMENTS_AVG        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ LIVINGAREA_AVG              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ NONLIVINGAPARTMENTS_AVG     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ NONLIVINGAREA_AVG           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ APARTMENTS_MODE             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ BASEMENTAREA_MODE           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ YEARS_BEGINEXPLUATATION_MODE: list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ YEARS_BUILD_MODE            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ COMMONAREA_MODE             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ ELEVATORS_MODE              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ ENTRANCES_MODE              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ FLOORSMAX_MODE              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ FLOORSMIN_MODE              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ LANDAREA_MODE               : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ LIVINGAPARTMENTS_MODE       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ LIVINGAREA_MODE             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ NONLIVINGAPARTMENTS_MODE    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ NONLIVINGAREA_MODE          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ APARTMENTS_MEDI             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ BASEMENTAREA_MEDI           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ YEARS_BEGINEXPLUATATION_MEDI: list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ YEARS_BUILD_MEDI            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ COMMONAREA_MEDI             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ ELEVATORS_MEDI              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ ENTRANCES_MEDI              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ FLOORSMAX_MEDI              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ FLOORSMIN_MEDI              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ LANDAREA_MEDI               : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ LIVINGAPARTMENTS_MEDI       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ LIVINGAREA_MEDI             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ NONLIVINGAPARTMENTS_MEDI    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ NONLIVINGAREA_MEDI          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ FONDKAPREMONT_MODE          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ HOUSETYPE_MODE              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ TOTALAREA_MODE              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ WALLSMATERIAL_MODE          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ EMERGENCYSTATE_MODE         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ OBS_30_CNT_SOCIAL_CIRCLE    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ DEF_30_CNT_SOCIAL_CIRCLE    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ OBS_60_CNT_SOCIAL_CIRCLE    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ DEF_60_CNT_SOCIAL_CIRCLE    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ DAYS_LAST_PHONE_CHANGE      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ FLAG_DOCUMENT_2             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ FLAG_DOCUMENT_3             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ FLAG_DOCUMENT_4             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. .. [list output truncated]
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

This table has 307511 observations of 122 features, of a wide variety. Many of the feature names are self-explanatory and intuitively informative of a borrower’s solvency (for example, FLAG_OWN_CAR is a binary feature which indicates whether the client owns a car). For other features, however, the meaning of the feature is unclear from the name, or the relevance to a borrower’s solvency may not be obvious (e.g. EXT_SOURCE_1, APARTMENTS_AVG, FLAG_DOCUMENT_2).

2. Feature distributions and meanings

The next order of business is to comb through each of the features in this table, and get a rough picture of their distributions. For the features whose meanings are not obvious from their names, I’ll also try to get a better idea of what they encode.

This is a tedious process, and only a rough exploration of the features. In later sections I’ll dig deeper into how these features relate, and in future notebooks I’ll look into how the features in this table relate to the features of other tables.

2.1 TARGET

The target column is a binary variable - presumably indicating whether or not the client defaulted on his/her loan.

# Count the rows in each TARGET class and draw the counts as horizontal bars,
# labelling each bar with the class's share of all rows (rounded to 2 decimals).
train %>%
  count(TARGET) %>%
  rename(count = n) %>%
  mutate(proportion = round(count / sum(count), 2)) %>%
  ggplot(aes(x = TARGET, y = count, fill = TARGET, label = proportion)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  geom_text()

Around 8% of the responses are 1, and the remaining 92% are 0.

From this it’s not clear whether a positive value of TARGET means that the client defaulted or not. We can look at the correlation between the target and each of the numeric features to see whether any features that are obviously related to increased/decreased risk of default are correlated with the target.

# Correlate every numeric column (the SK_ID_CURR identifier excluded) with
# TARGET, using pairwise-complete observations, and draw the coefficients as
# horizontal bars ordered by correlation and filled by sign.
p1 <- train %>%
  select(-SK_ID_CURR) %>%
  select_if(is.numeric) %>%
  cor(use = "pairwise.complete.obs") %>%
  melt() %>%
  filter(Var2 == "TARGET", Var1 != "TARGET") %>%
  rename(Variable = Var1, correlation = value) %>%
  mutate(positive = correlation > 0) %>%
  ggplot(aes(x = reorder(Variable, -correlation), y = correlation, fill = positive)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ggtitle("Correlation between variables and TARGET")
## Warning in cor(., use = "pairwise.complete.obs"): the standard deviation is
## zero
p1

Here we can see that features usually associated with solvency like DAYS_EMPLOYED and AMT_GOODS_PRICE are negatively correlated with the target. This makes me believe that TARGET = 1 means that the client defaulted on a loan, while TARGET = 0 means that he did not.

This means that only 8% of the clients defaulted on their loans. This class imbalance will surely be a challenge in building a good classifier, and will be something we will have to consider carefully in the future.

2.2 NAME_CONTRACT_TYPE

# Tabulate the contract types, most frequent first, together with each
# type's share of all rows.
train %>%
  count(NAME_CONTRACT_TYPE, sort = TRUE) %>%
  rename(count = n) %>%
  mutate(proportion = count / sum(count))

There are only two types of loans in the training set - cash loans (which account for 90% of the training data) and revolving loans (the remaining 10%).

Revolving loans are arrangements in which the agreed loan amount can be withdrawn, repaid and redrawn any number of times (citation needed), while a cash loan presumably means that the loan amount is given up front, and the client is expected to repay it in full at the end of the loan period.

Revolving loans are therefore much more flexible. It’s safe to assume that Home Credit is more discerning when deciding who is eligible for these types of loans.

# Mean of TARGET for each contract type, i.e. the share of rows with TARGET = 1.
train %>%
  group_by(NAME_CONTRACT_TYPE) %>%
  summarise(proportion.default = mean(TARGET))

Indeed, applicants who receive Revolving loans have a lower default rate than those who get Cash loans.

One can ask - which type of applicant is more likely to receive a Revolving loan, and which is more likely to receive a Cash loan? If indeed Home Credit only gives Revolving loans to the most trustworthy-looking clients, however, this question is probably intimately related to the client’s ability to repay the loan.

In fact, it may be the case that if a client receives a Revolving loan, it’s because the models Home Credit developed predict that the client is likely to repay their loan. Thus, using this feature may be a way of transferring the insight of Home Credit’s models into our own models.


Looking at how the various numeric features correlate with the target variable and with an applicant’s likelihood of getting a Revolving loan, we see some interesting similarities and differences (see graph below).

For example, high values of AMT_CREDIT and AMT_ANNUITY are negatively correlated with both TARGET and the likelihood of getting a revolving loan, although the correlation is much stronger for the latter. However, having a positive value for the feature FLAG_DOCUMENT_3 is much more negatively correlated with the likelihood of getting a revolving loan than with the target feature (in fact, FLAG_DOCUMENT_3 is positively correlated with TARGET); perhaps the presence/absence of this document is directly linked to a client’s eligibility to get a revolving loan.

# Add an indicator column that is 1 for revolving loans and 0 otherwise,
# correlate every numeric column (SK_ID_CURR excluded) with it, and draw the
# coefficients as horizontal bars ordered by correlation and filled by sign.
p2 <- train %>%
  mutate(revolving = ifelse(NAME_CONTRACT_TYPE == "Revolving loans", 1, 0)) %>%
  select_if(is.numeric) %>%
  select(-SK_ID_CURR) %>%
  cor(use = "pairwise.complete.obs") %>%
  melt() %>%
  filter(Var2 == "revolving" & Var1 != "revolving") %>%
  rename(correlation = value, Variable = Var1) %>%
  mutate(positive = correlation > 0) %>%
  ggplot(aes(x = reorder(Variable, -correlation), y = correlation, fill = positive)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ggtitle("Correlation between variables and REVOLVING")
## Warning in cor(., use = "pairwise.complete.obs"): the standard deviation is
## zero
# stack the TARGET correlation plot (p1) above the revolving-loan one
multiplot(p1, p2)

It’s interesting how AMT_ANNUITY is so negatively correlated with receiving a revolving loan. Perhaps if a client has a large annuity contract with an insurance company, they are believed to be less liquid, and therefore less likely to repay a revolving loan? Any ideas? I’ll have to look into this some more.

2.3 CODE_GENDER

# For each gender code: number of applicants, mean of TARGET, and share of
# all applicants (rounded to 3 decimals).
train %>%
  group_by(CODE_GENDER) %>%
  summarise(
    count = n(),
    mean.target = mean(TARGET)
  ) %>%
  ungroup() %>%
  mutate(proportion = round(count / sum(count), 3))

65% of applicants are female. Almost all of the remaining applicants are male - except for applicants who selected some sort of “other” category.

Interestingly, on a global level, 10% of males default on their loans, while 7% of females default on theirs. I find this type of disparity - which may seem easy to understand on the surface - very interesting to investigate further. Therefore, I’m going to go on a bit of a tangent, and try to identify what is different between male and female candidates, in hopes of identifying why males seem more likely to default than women.

2.3.0 Loan type

There doesn’t seem to be a difference in one’s likelihood of receiving a cash or revolving loan based on gender, and so this doesn’t explain the difference in solvency between the genders.

# Within each gender code, compute the share of applicants holding each
# contract type, then plot the shares side by side for the two genders
# (rows coded "XNA" are dropped before plotting).
train %>%
  count(CODE_GENDER, NAME_CONTRACT_TYPE) %>%
  group_by(CODE_GENDER) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup() %>%
  filter(CODE_GENDER != "XNA") %>%
  ggplot(aes(x = NAME_CONTRACT_TYPE, y = proportion, fill = CODE_GENDER)) +
  geom_col(position = "dodge") +
  ggtitle("Likelihood of Males/Females receiving Cash/Revolving loans")

2.3.1 Income inequality

# Density of total income over all applicants, drawn on a log10 x-axis.
ggplot(train, aes(x = AMT_INCOME_TOTAL)) +
  geom_density() +
  scale_x_log10() +
  ggtitle("Income Density (log-scale)")

Income is generally very positively skewed - hence the log scale used above. With the exception of a few very rich applicants, most applicants earn less than $1,000,000 in total income.

Focusing on just these clients and splitting the income density by gender, we see evidence of gender inequality in total income (see graph below).

# Income densities per gender, overlaid, for applicants with a total income
# of at most 1e6 (rows coded "XNA" excluded).
p3 <- train %>%
  filter(CODE_GENDER != "XNA", AMT_INCOME_TOTAL <= 1e6) %>%
  ggplot(aes(x = AMT_INCOME_TOTAL, fill = CODE_GENDER)) +
  geom_density(alpha = .6, show.legend = FALSE) +
  scale_x_log10() +
  ggtitle("Income Density (log-scale)")

# Same data with position = "fill": the densities are stacked and scaled so
# that each gender's band shows its share at a given income. The dashed red
# line marks an income of 150000.
p4 <- train %>%
  filter(CODE_GENDER != "XNA", AMT_INCOME_TOTAL <= 1e6) %>%
  ggplot(aes(x = AMT_INCOME_TOTAL, fill = CODE_GENDER)) +
  geom_density(alpha = .6, position = "fill") +
  scale_x_log10() +
  ggtitle("Income Proportion (log-scale)") +
  geom_vline(xintercept = 150000, linetype = "dashed", color = "red")

# draw the two plots side by side
multiplot(p3, p4, cols = 2)

Men’s salaries are shifted towards higher salaries, and the densities of men’s and women’s salaries have a similar shape. Men are more likely to receive very high salaries: although men account for only 35% of the applicants in the training set, they hold 42% of positions that pay greater than $150,000, 49% of positions that pay greater than $250,000, and 55% of positions that pay greater than $500,000.

This may be evidence that high paying leadership positions are generally reserved for men - which is consistent with our knowledge of present inequalities in the workplace.

Although this phenomenon is interesting, it does not explain on its own why men seem to default on their loans more frequently.

2.3.2 More inequality? AMT_GOODS_PRICE, AMT_CREDIT and AMT_ANNUITY

The distributions of the features AMT_GOODS_PRICE, AMT_CREDIT and AMT_ANNUITY are also positively skewed, and fairly mound shaped on the log-scale:

# Overall densities of goods price, credit amount and annuity amount on a
# log10 x-axis. Dashed red vertical lines are drawn at fixed reference
# values on each plot.

# p5: goods price, reference line at 2.6e6.
p5 <- ggplot(train, aes(x = AMT_GOODS_PRICE)) +
  geom_density() +
  geom_vline(xintercept = 2.6e6, linetype = "dashed", color = "red") +
  scale_x_log10() +
  labs(title = "Goods Price (log-scale)")

# p6: credit amount, reference line at 3e6.
p6 <- ggplot(train, aes(x = AMT_CREDIT)) +
  geom_density() +
  geom_vline(xintercept = 3e6, linetype = "dashed", color = "red") +
  scale_x_log10() +
  labs(title = "Credit amount (log-scale)")

# p7: annuity amount, reference lines at 3.5e3 and 1.3e5.
p7 <- ggplot(train, aes(x = AMT_ANNUITY)) +
  geom_density() +
  geom_vline(xintercept = 1.3e5, linetype = "dashed", color = "red") +
  geom_vline(xintercept = 3.5e3, linetype = "dashed", color = "red") +
  scale_x_log10() +
  labs(title = "Annuity amount (log-scale)")

# Stack the three plots in a single column.
multiplot(p5, p6, p7, cols = 1)
## Warning: Removed 278 rows containing non-finite values (stat_density).
## Warning: Removed 12 rows containing non-finite values (stat_density).

Keeping only those records within the red bands in the graphs above, and again splitting by gender, we see little evidence of gender inequality in the goods prices and credit amounts (first and second rows of graph below).

When we look at the annuity amount, however, we see a phenomenon similar to the one observed in total income: the shapes of the annuity densities - split by gender - are roughly the same, but the male annuities are slightly shifted towards higher values (third row in graph below).

# Per-gender views of goods price, credit amount and annuity amount.
# Each amount is plotted twice: as overlaid densities and as densities
# stacked to fill the panel. "XNA" gender rows are excluded, and each
# amount is restricted to a fixed range before plotting. Legends are
# hidden on all six plots.

# Goods price, values up to 2.6e6.
p8 <- train %>%
  filter(AMT_GOODS_PRICE <= 2.6e6, CODE_GENDER != "XNA") %>%
  ggplot(aes(x = AMT_GOODS_PRICE, fill = CODE_GENDER)) +
  geom_density(alpha = 0.6, show.legend = FALSE) +
  scale_x_log10() +
  labs(title = "Goods Price Density (log-scale)")

p9 <- train %>%
  filter(AMT_GOODS_PRICE <= 2.6e6, CODE_GENDER != "XNA") %>%
  ggplot(aes(x = AMT_GOODS_PRICE, fill = CODE_GENDER)) +
  geom_density(alpha = 0.6, position = "fill", show.legend = FALSE) +
  scale_x_log10() +
  labs(title = "Goods Price Proportion (log-scale)")

# Credit amount, values up to 3e6.
p10 <- train %>%
  filter(AMT_CREDIT <= 3e6, CODE_GENDER != "XNA") %>%
  ggplot(aes(x = AMT_CREDIT, fill = CODE_GENDER)) +
  geom_density(alpha = 0.6, show.legend = FALSE) +
  scale_x_log10() +
  labs(title = "Credit Density (log-scale)")

p11 <- train %>%
  filter(AMT_CREDIT <= 3e6, CODE_GENDER != "XNA") %>%
  ggplot(aes(x = AMT_CREDIT, fill = CODE_GENDER)) +
  geom_density(alpha = 0.6, position = "fill", show.legend = FALSE) +
  scale_x_log10() +
  labs(title = "Credit Proportion (log-scale)")

# Annuity amount, values between 3.5e3 and 1.3e5.
p12 <- train %>%
  filter(between(AMT_ANNUITY, 3.5e3, 1.3e5), CODE_GENDER != "XNA") %>%
  ggplot(aes(x = AMT_ANNUITY, fill = CODE_GENDER)) +
  geom_density(alpha = 0.6, show.legend = FALSE) +
  scale_x_log10() +
  labs(title = "Annuity amount Density (log-scale)")

p13 <- train %>%
  filter(between(AMT_ANNUITY, 3.5e3, 1.3e5), CODE_GENDER != "XNA") %>%
  ggplot(aes(x = AMT_ANNUITY, fill = CODE_GENDER)) +
  geom_density(alpha = 0.6, position = "fill", show.legend = FALSE) +
  scale_x_log10() +
  labs(title = "Annuity amount Proportion (log-scale)")

# 3 x 2 grid filled row by row: one row per amount, density on the left
# and proportion on the right.
multiplot(
  p8, p9, p10, p11, p12, p13,
  layout = matrix(1:6, ncol = 2, byrow = TRUE)
)

Again - although interesting - the pattern of higher annuity for men than women does not explain the apparent pattern of lower solvency of men than women. In fact - the overall correlation between annuity amount and likelihood to default is negative (although close to zero in absolute value).


We see some interesting inequalities between men and women with regard to Income, Credit, Annuity, and Goods. Perhaps the joint distributions of pairs of these features - and how they affect likelihood to default differently for men and women - may give us insight into why men default on their loans more often.

Below are six plots, one for each pair of features in {AMT_INCOME_TOTAL, AMT_GOODS_PRICE, AMT_CREDIT, AMT_ANNUITY}. The first row of each plot shows a 2D density plot of the pair of features, split by gender. The heat maps (tile charts) in the second row of each figure are more involved; they are created using the following procedure:

  • Split each (log) feature into 20 equally sized bins.
  • Compute the number of records that fall into each pair of bins, for the two features. This will correspond with the opacity of the final tiles.
  • Compute the average default rate within the bins. This will correspond with the color of the tiles.
  • Split the heatmap by gender.

I’m hoping to see one of two interesting things in these visualizations: first, it would be interesting to observe differences in the 2D distributions of the features for men and women. Better yet - I’m looking to see if these differences in distributions also correspond to areas of very different default rates - for example, if women tend to have low income but high annuity more frequently than men, and this characteristic is also associated with low default rates (not sure if this is true yet…)

# number of bins to split each log-transformed amount into
bins <- 20

# Working table for the pairwise plots: the four amount columns plus
# gender and target, the natural log of each amount, and each log value
# cut into `bins` intervals.
tmp <- train %>%
  select(
    AMT_INCOME_TOTAL, AMT_CREDIT, AMT_GOODS_PRICE, AMT_ANNUITY,
    CODE_GENDER, TARGET
  ) %>%
  mutate(
    income.log = log(AMT_INCOME_TOTAL),
    credit.log = log(AMT_CREDIT),
    goods.log = log(AMT_GOODS_PRICE),
    annuity.log = log(AMT_ANNUITY),
    income.bin = cut(income.log, breaks = bins),
    credit.bin = cut(credit.log, breaks = bins),
    goods.bin = cut(goods.log, breaks = bins),
    annuity.bin = cut(annuity.log, breaks = bins)
  )
Income vs Credit
# p14: 2D density of log income vs. log credit, one facet per gender
# ("XNA" excluded).
p14 <- tmp %>%
  filter(CODE_GENDER != "XNA") %>%
  ggplot(aes(x = income.log, y = credit.log)) +
  stat_density_2d(
    aes(fill = ..level..),
    geom = "polygon",
    show.legend = FALSE
  ) +
  facet_wrap(~CODE_GENDER) +
  labs(title = "Densities over Income and Credit")

# p15: heat map over (income bin, credit bin) per gender. Tile colour is
# the mean of TARGET in the cell; tile opacity is the cell's share of that
# gender's records. Cells holding a single record are dropped after the
# shares are computed.
p15 <- tmp %>%
  filter(CODE_GENDER != "XNA") %>%
  group_by(income.bin, credit.bin, CODE_GENDER) %>%
  summarise(target.mean = mean(TARGET), count = n()) %>%
  group_by(CODE_GENDER) %>%
  mutate(proportion = count / sum(count)) %>%
  ungroup() %>%
  filter(count > 1) %>%
  ggplot(aes(
    x = income.bin, y = credit.bin,
    fill = target.mean, alpha = proportion
  )) +
  geom_tile() +
  facet_wrap(~CODE_GENDER) +
  scale_fill_distiller(palette = "Spectral") +
  scale_alpha(range = c(0.2, 1)) +
  labs(title = "Income and Credit bins, versus target") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Density plot on top, heat map below.
multiplot(p14, p15, cols = 1)

Again, we can see that men tend to have higher income and slightly higher credit than women. In terms of the higher default rate, I don’t see a clear indication that high credit and high income leads to high default rate - it looks like men have higher default rates evenly across the distributions of income and credit.

Income vs Annuity
# p16: 2D density of log income vs. log annuity, one facet per gender
# ("XNA" excluded).
p16 <- tmp %>%
  filter(CODE_GENDER != "XNA") %>%
  ggplot(aes(x = income.log, y = annuity.log)) +
  stat_density_2d(
    aes(fill = ..level..),
    geom = "polygon",
    show.legend = FALSE
  ) +
  facet_wrap(~CODE_GENDER) +
  labs(title = "Densities over Income and Annuity")

# p17: heat map over (income bin, annuity bin) per gender. Tile colour is
# the mean of TARGET in the cell; tile opacity is the cell's share of that
# gender's records. Cells holding a single record are dropped after the
# shares are computed.
p17 <- tmp %>%
  filter(CODE_GENDER != "XNA") %>%
  group_by(income.bin, annuity.bin, CODE_GENDER) %>%
  summarise(target.mean = mean(TARGET), count = n()) %>%
  group_by(CODE_GENDER) %>%
  mutate(proportion = count / sum(count)) %>%
  ungroup() %>%
  filter(count > 1) %>%
  ggplot(aes(
    x = income.bin, y = annuity.bin,
    fill = target.mean, alpha = proportion
  )) +
  geom_tile() +
  facet_wrap(~CODE_GENDER) +
  scale_fill_distiller(palette = "Spectral") +
  scale_alpha(range = c(0.2, 1)) +
  labs(title = "Income and Annuity bins, versus target") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Density plot on top, heat map below.
multiplot(p16, p17, cols = 1)
## Warning: Removed 12 rows containing non-finite values (stat_density2d).

Again, we can see that men clearly tend to have higher annuity and income; the “hot” spots in the density plot in the first row tend to be shifted up and right for men compared to women. The higher density in this corner, however, does not seem to correspond with pockets of higher default rates.

Income vs Goods
# p18: 2D density of log income vs. log goods price, one facet per gender
# ("XNA" excluded).
p18 <- tmp %>%
  filter(CODE_GENDER != "XNA") %>%
  ggplot(aes(x = income.log, y = goods.log)) +
  stat_density_2d(
    aes(fill = ..level..),
    geom = "polygon",
    show.legend = FALSE
  ) +
  facet_wrap(~CODE_GENDER) +
  labs(title = "Densities over Income and Goods")

# p19: heat map over (income bin, goods bin) per gender, legend hidden.
# Tile colour is the mean of TARGET in the cell; tile opacity is the
# cell's share of that gender's records. Cells holding a single record
# are dropped after the shares are computed.
p19 <- tmp %>%
  filter(CODE_GENDER != "XNA") %>%
  group_by(income.bin, goods.bin, CODE_GENDER) %>%
  summarise(target.mean = mean(TARGET), count = n()) %>%
  group_by(CODE_GENDER) %>%
  mutate(proportion = count / sum(count)) %>%
  ungroup() %>%
  filter(count > 1) %>%
  ggplot(aes(
    x = income.bin, y = goods.bin,
    fill = target.mean, alpha = proportion
  )) +
  geom_tile(show.legend = FALSE) +
  facet_wrap(~CODE_GENDER) +
  scale_fill_distiller(palette = "Spectral") +
  scale_alpha(range = c(0.2, 1)) +
  labs(title = "Income and Goods bins, versus target") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Density plot on top, heat map below.
multiplot(p18, p19, cols = 1)
## Warning: Removed 278 rows containing non-finite values (stat_density2d).

Nothing particularly exciting here.

Credit vs Annuity
# p20: 2D density of log credit vs. log annuity, one facet per gender
# ("XNA" excluded).
p20 <- tmp %>%
  filter(CODE_GENDER != "XNA") %>%
  ggplot(aes(x = credit.log, y = annuity.log)) +
  stat_density_2d(
    aes(fill = ..level..),
    geom = "polygon",
    show.legend = FALSE
  ) +
  facet_wrap(~CODE_GENDER) +
  labs(title = "Densities over Credit and Annuity")

# p21: heat map over (credit bin, annuity bin) per gender. Tile colour is
# the mean of TARGET in the cell; tile opacity is the cell's share of that
# gender's records. Cells holding a single record are dropped after the
# shares are computed.
p21 <- tmp %>%
  filter(CODE_GENDER != "XNA") %>%
  group_by(credit.bin, annuity.bin, CODE_GENDER) %>%
  summarise(target.mean = mean(TARGET), count = n()) %>%
  group_by(CODE_GENDER) %>%
  mutate(proportion = count / sum(count)) %>%
  ungroup() %>%
  filter(count > 1) %>%
  ggplot(aes(
    x = credit.bin, y = annuity.bin,
    fill = target.mean, alpha = proportion
  )) +
  geom_tile() +
  facet_wrap(~CODE_GENDER) +
  scale_fill_distiller(palette = "Spectral") +
  scale_alpha(range = c(0.2, 1)) +
  labs(title = "Credit and Annuity bins, versus target") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Density plot on top, heat map below.
multiplot(p20, p21, cols = 1)
## Warning: Removed 12 rows containing non-finite values (stat_density2d).

Nothing revealing about the differences in default rate relating to densities here. It is interesting that credit and annuity are so highly correlated, however. We’ll look into this later.

Credit vs Goods
# p22: 2D density of log credit vs. log goods price, one facet per gender
# ("XNA" excluded).
p22 <- tmp %>%
  filter(CODE_GENDER != "XNA") %>%
  ggplot(aes(x = credit.log, y = goods.log)) +
  stat_density_2d(
    aes(fill = ..level..),
    geom = "polygon",
    show.legend = FALSE
  ) +
  facet_wrap(~CODE_GENDER) +
  ggtitle("Densities over Credit and Goods")

# p23: heat map over (credit bin, goods bin) per gender. Tile colour is
# the mean of TARGET in the cell; tile opacity is the cell's share of that
# gender's records. Cells holding a single record are dropped after the
# shares are computed.
p23 <- tmp %>%
  # Exclude "XNA" explicitly so the heat map has the same facets as the
  # density plot above, rather than relying on the count > 1 filter below
  # to happen to remove every XNA cell.
  filter(CODE_GENDER != "XNA") %>%
  group_by(credit.bin, goods.bin, CODE_GENDER) %>%
  summarize(target.mean = mean(TARGET),
            count = n()) %>%
  ungroup() %>%
  group_by(CODE_GENDER) %>%
  mutate(proportion = count / sum(count)) %>%
  ungroup() %>%
  filter(count > 1) %>%
  ggplot(aes(x = credit.bin, y = goods.bin,
             fill = target.mean, alpha = proportion)) +
  geom_tile() +
  scale_fill_distiller(palette = "Spectral") +
  scale_alpha(range = c(0.2, 1)) +
  ggtitle("Credit and Goods bins, versus target") +
  facet_wrap(~CODE_GENDER) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Density plot on top, heat map below.
multiplot(p22, p23, cols = 1)
## Warning: Removed 278 rows containing non-finite values (stat_density2d).

The densities of credit and goods prices are very similar for men and women. Again, these two features are very highly correlated - more later.

Annuity vs Goods
# p24: 2D density of log annuity vs. log goods price, one facet per gender
# ("XNA" excluded).
p24 <- tmp %>%
  filter(CODE_GENDER != "XNA") %>%
  ggplot(aes(x = annuity.log, y = goods.log)) +
  stat_density_2d(
    aes(fill = ..level..),
    geom = "polygon",
    show.legend = FALSE
  ) +
  facet_wrap(~CODE_GENDER) +
  ggtitle("Densities over Annuity and Goods")

# p25: heat map over (annuity bin, goods bin) per gender, legend hidden.
# Tile colour is the mean of TARGET in the cell; tile opacity is the
# cell's share of that gender's records. Cells holding a single record
# are dropped after the shares are computed.
p25 <- tmp %>%
  # Exclude "XNA" explicitly so the heat map has the same facets as the
  # density plot above, rather than relying on the count > 1 filter below
  # to happen to remove every XNA cell.
  filter(CODE_GENDER != "XNA") %>%
  group_by(annuity.bin, goods.bin, CODE_GENDER) %>%
  summarize(target.mean = mean(TARGET),
            count = n()) %>%
  ungroup() %>%
  # Per-gender share of records; this step was previously repeated twice
  # with an identical result.
  group_by(CODE_GENDER) %>%
  mutate(proportion = count / sum(count)) %>%
  ungroup() %>%
  filter(count > 1) %>%
  ggplot(aes(x = annuity.bin, y = goods.bin,
             fill = target.mean, alpha = proportion)) +
  geom_tile(show.legend = FALSE) +
  scale_fill_distiller(palette = "Spectral") +
  scale_alpha(range = c(0.2, 1)) +
  ggtitle("Annuity and Goods bins, versus target") +
  facet_wrap(~CODE_GENDER) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Density plot on top, heat map below.
multiplot(p24, p25, cols = 1)
## Warning: Removed 290 rows containing non-finite values (stat_density2d).

Here, we notice that it’s more common for men to have a very low Goods price but a low/medium annuity. From the heatmaps, it seems like a low goods price combined with a low annuity tends to lead to higher default rates. Very few applications fall under this category, however, so this evidence is very shaky.


Unfortunately, just looking at income, annuity, credit, and goods prices doesn’t reveal what it is about men that leads to overall higher default rates. To get to the bottom of it, I’ll need to dig a bit deeper into what’s different between men and women in this dataset.

2.3.4 Occupation differences between men and women

One potential explanation for the difference between male and female solvency could be differences in occupations frequently occupied by males and females, and how occupations relate to solvency. This idea is the topic of this subsection.


Looking at the distribution of males and females in the various occupations (below), we see that the majority of applicants occupy one of the positions Working, State Servant, Pensioner, or Commercial Associate (top left). I’ll refer to these occupations as the “big four” occupations.

People who are either on Maternity leave or unemployed have a much higher likelihood of defaulting on their debts (top right), though these occupations constitute a very small minority of the applicants. Among the big four occupations, the default rate hovers around 8%, though those with the occupation Working default at a slightly higher rate, and those with the occupations State Servant and Pensioner default at a lower rate.

Although 60% of all applicants of the Working occupation are female (bottom left), 60% of male applicants are of this occupation, while only 47% of female applicants are of this occupation (bottom right). In other words, although most “workers” are female, a larger proportion of males are workers than that of females. Since people of the working occupation have a slightly higher default rate, this may explain why males have a higher global default rate. Although, we cannot know in which direction the causation lies (is male default rate higher because most males are workers, or are worker default rates high because many workers are male???)

Along this same logic: A larger proportion of females are state servants and pensioners, compared to the proportion of males of these occupations. Since these occupations have slightly lower default rates, this could explain why females have lower default rates (same warning as before.)

# p26: number of applicants per income type and gender ("XNA" excluded),
# as dodged horizontal bars.
p26 <- train %>%
  filter(CODE_GENDER != "XNA") %>%
  count(NAME_INCOME_TYPE, CODE_GENDER) %>%
  rename(count = n) %>%
  ggplot(aes(x = NAME_INCOME_TYPE, y = count, fill = CODE_GENDER)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(title = "Employment type - global count")

# p27: gender split within each income type. Bars are stacked to fill the
# panel; labels show each gender's percentage of that income type.
p27 <- train %>%
  filter(CODE_GENDER != "XNA") %>%
  count(NAME_INCOME_TYPE, CODE_GENDER) %>%
  rename(count = n) %>%
  group_by(NAME_INCOME_TYPE) %>%
  mutate(
    proportion = count / sum(count),
    label = paste(round(proportion * 100, 3), "%")
  ) %>%
  ungroup() %>%
  ggplot(aes(
    x = NAME_INCOME_TYPE, y = count,
    fill = CODE_GENDER, label = label
  )) +
  geom_col(position = "fill", show.legend = FALSE) +
  geom_text(position = position_fill()) +
  coord_flip() +
  labs(title = "Employ. Type. Percentage by gender")

# p28: income-type split within each gender. Proportions are rounded to
# three decimals before being plotted and labelled as percentages.
p28 <- train %>%
  filter(CODE_GENDER != "XNA") %>%
  count(NAME_INCOME_TYPE, CODE_GENDER) %>%
  rename(count = n) %>%
  group_by(CODE_GENDER) %>%
  mutate(proportion = count / sum(count)) %>%
  ungroup() %>%
  mutate(
    proportion = round(proportion, 3),
    label = paste(proportion * 100, "%")
  ) %>%
  ggplot(aes(
    x = NAME_INCOME_TYPE, y = proportion,
    fill = CODE_GENDER, label = label
  )) +
  geom_col(position = "dodge", show.legend = FALSE) +
  geom_text(position = position_dodge(width = 0.9)) +
  coord_flip() +
  labs(title = "Within gender emply. type perc.")

# p29: share of each TARGET value within each income type (all genders),
# with a dashed reference line at 0.08.
p29 <- train %>%
  count(NAME_INCOME_TYPE, TARGET) %>%
  mutate(TARGET = as.factor(TARGET)) %>%
  ggplot(aes(x = NAME_INCOME_TYPE, y = n, fill = TARGET)) +
  geom_col(position = "fill") +
  geom_hline(yintercept = 0.08, linetype = "dashed") +
  coord_flip() +
  scale_fill_discrete(h = c(50, 1000))

# 2 x 3 layout filled row by row: p26 spans the first two cells of the top
# row with p29 to its right; p27 takes the first cell of the bottom row
# and p28 spans the remaining two.
multiplot(
  p26, p29, p27, p28,
  layout = matrix(c(1, 1, 2, 3, 4, 4), nrow = 2, byrow = TRUE)
)